/*******************************************************************************
Author: Jason Robey
Date: 5/15/2025
Purpose: Create simulated true birth years by assigning random dates to the
birth date and homicide date for every homicide offender. From these simulated
true birth years, create simulated true birth cohort homicide rates by age. 
Run simulation 1,000 times. Capture the average, minimum, and maximum values. 
Compare these simulated true birth cohort homicide rates to the observed cohort
homicide rates. 
*******************************************************************************/

* Session setup: clear the screen and memory, close any log left open,
* define the project directories, open a dated log file, and set the
* constants used by the rest of the script.
cls 
clear 
capture log close

* Root directory of the project (left blank here; set before running)
global path ""

* Raw data directory
global rd "$path/"

* Working data directory
global wd "$path/"

* Log file directory
global ld "$path/"

* Output file directory
global od "$path/"

* Quoted so that a directory name containing spaces is still accepted
cd "$wd"

* Date string to append to the log file name: today's date as CCYY.NN.DD
local date: display %td_CCYY-NN-DD date(c(current_date), "DMY")
local date_string = subinstr(trim("`date'"), "-" , ".", .)

* Starting log file 
log using "$ld/name.`date_string'.log", replace

* Multiplier applied to homicides/population when rates are computed below
global scale 100000
* Prefix of the intermediate .dta files this script saves and re-reads
global pre "3.2.1"


/*******************************************************************************
Simulating True Cohort Homicide Rates
*******************************************************************************/

* Create the two accumulator files as empty datasets; every simulation run
* appends its results to them. Order matters only in that sim.dta is the
* last file saved.
foreach stub in sim_byc sim {
	clear all
	save ${pre}.`stub'.dta, replace emptyok
}

* The seed is the date of the original simulation (2025-05-15)
set seed 20250515

* Simulation loop (1,000 runs). Each run:
*   1. draws a random homicide date inside the reported year,
*   2. draws a random birth date from the window of dates consistent with
*      the offender's reported age on that homicide date,
*   3. aggregates homicides by simulated birth year and age, merges
*      population counts, and computes rates for ages 15-19,
*   4. appends the run's results to $pre.sim_byc.dta and $pre.sim.dta.
* The tab/fre/list/sum calls are diagnostics written to the log.
* fre and personage are not defined in this file (presumably
* community-contributed commands that must be installed).
forval i = 1/1000 {

	use 1.2.homicide_ind.dta, clear
	* missing() also catches extended missing values (.a-.z)
	drop if missing(off_age)

	keep state year n num statefips off_age *_arr

	* mdy(1,1,year)-mdy(12,31,year)-1 is minus the number of days in the year,
	* so the floor of the whole expression is uniform over 1 Jan .. 31 Dec
	generate hom_date = floor((mdy(1,1,year)-mdy(12,31,year) -1)*runiform() + mdy(1,1,year+1))
	format hom_date %td

	gen hom_month = month(hom_date)
	gen hom_year = year(hom_date)
	gen hom_day = day(hom_date)
	gen hom_doy = doy(hom_date)

	tab hom_day hom_month 

	fre hom_month hom_day 

	* The simulated date must stay inside the reported year
	assert hom_year==year 

	* Latest possible birth date: the offender turns off_age on the homicide
	* date itself (28 Feb stands in for 29 Feb when birth_year is not a leap year)
	gen birth_year = hom_year - off_age 
	gen birth_date_exact = mdy(hom_month,hom_day,birth_year)
	replace birth_date_exact = mdy(2,28,birth_year) if birth_date_exact==. & hom_month==2 & hom_day==29
	format birth_date_exact %td

	* Earliest possible birth date: the day after the same month/day one year
	* earlier (1 Mar when that month/day would be a non-existent 29 Feb)
	gen birth_date_last = mdy(hom_month,hom_day,birth_year-1)
	replace birth_date_last = birth_date_last + 1 
	replace birth_date_last = mdy(3,1,birth_year-1) if birth_date_last==. & hom_month==2 & hom_day==29
	format birth_date_last %td 

	* Uniform integer draw over [birth_date_last, birth_date_exact], both ends
	* included. floor() has to wrap the scaled draw (span + 1 days); flooring
	* only the span leaves a fractional date and can never return
	* birth_date_exact. One runiform() per observation, as before.
	gen birth_date = floor((birth_date_exact - birth_date_last + 1)*runiform()) + birth_date_last 
	* NOTE(review): a %td value can only be 29 Feb in a leap year, so this
	* condition is met only by a 29 Feb draw in birth_year-1, which is moved to
	* 1 Mar of that same year. Kept as written - confirm the intent.
	replace birth_date = mdy(3,1,year(birth_date)) if month(birth_date)==2 & day(birth_date)==29 & ((birth_year/4!=int(birth_year/4)))
	format birth_date %td

	gen birth_month_sim = month(birth_date)
	gen birth_day_sim = day(birth_date)

	tab birth_day_sim birth_month_sim 

	fre birth_day_sim birth_month_sim 

	* birth_year_dif = year-minus-age birth year less the simulated birth year
	gen birth_year_sim = year(birth_date)
	gen birth_year_dif = (birth_year - birth_year_sim)
	fre birth_year_dif 

	* Mean birth-year difference by birth year and age for ages 15-19,
	* birth years 1960-1995, appended to the per-run accumulator file
	preserve 
	keep if inrange(off_age,15,19) & inrange(birth_year,1960,1995)
	egen birth_year_dif_all = mean(birth_year_dif)
	collapse (mean) birth_year_dif, by(birth_year off_age birth_year_dif_all)
	gen sim = `i'
	append using $pre.sim_byc.dta 
	save $pre.sim_byc.dta, replace 
	restore 

	* Consistency check: the age derived from the two simulated dates must
	* reproduce the reported age
	personage birth_date hom_date, gen(age_sim)

	assert off_age==age_sim

	collapse (sum) *hom_arr birth_year_dif (mean) by_corr=birth_year_dif, by(birth_year_sim off_age)

	rename *age age 

	gen year = age + birth_year_sim 

	* Attach the population variables by year and age; rows found only in
	* the population file are dropped
	merge 1:1 year age using 1.1.pop_age_nat.dta, keepusing(*pop)
	drop if _merge==2 
	drop if year>2019
	* NOTE(review): the reason for blanking apop in 1975 is not shown here
	replace apop=. if year==1975

	*Categoricals 
	recode age (15/19=15 "15-19") (20/24=20 "20-24") (25/29=25 "25-29") ///
		(30/34=30 "30-34") (35/39=35 "35-39") (40/44=40 "40-44") (45/49=45 "45-49") ///
		(50/54=50 "50-54") (55/59 =55 "55-59") (60/64=60 "60-64") (65/69=65 "65-69") ///
		(70/max=70 "70+"), gen(age_cat)

	*birth year categories
	recode birth_year_sim ///
		(1900/1904 = 1900 "1900-04") (1905/1909=1905 "1905-09") ///
		(1910/1914 = 1910 "1910-14") (1915/1919=1915 "1915-19") ///
		(1920/1924 = 1920 "1920-24") (1925/1929=1925 "1925-29") ///
		(1930/1934 = 1930 "1930-34") (1935/1939 = 1935 "1935-39") ///
		(1940/1944 = 1940 "1940-44") (1945/1949 = 1945 "1945-49") ///
		(1950/1954=1950 "1950-54") (1955/1959=1955 "1955-59") ///
		(1960/1964=1960 "1960-64") (1965/1969=1965 "1965-69") ///
		(1970/1974 = 1970 "1970-74") (1975/1979 = 1975 "1975-79") ///
		(1980/1984 = 1980 "1980-84") (1985/1989 = 1985 "1985-89") ///
		(1990/1994 = 1990 "1990-94") (1995/1999= 1995 "1995-99") ///
		(2000/2004=2000 "2000-04") ///
		, gen(birth_year_cat)

	collapse (sum) *hom_arr *pop birth_year_dif, by(birth_year_cat age_cat)

	* Bring in the variables stored by birth-year and age category
	* (the ahar_* series compared after the loop are expected to come from here)
	merge 1:1 birth_year_cat age_cat using 2.1.1.merge_yearcat.dta
	drop if _merge==2 

	* Simulated rates per $scale population under three denominators
	gen ahar_sim=ahom_arr/apop*$scale 
	gen ahar_sim2= ahom_arr/apop_pc*$scale 
	gen ahar_sim3 = ahom_arr/apop_rca2*$scale 

	list age_cat birth_year_cat ahar_sim ahar_sim2 ahar_sim3 if age_cat==15

	gen birth_year_correct = (ahom_arr - birth_year_dif)/ahom_arr 
	sum birth_year_correct 

	keep if age_cat==15 & inrange(birth_year_cat,1960,1995)
	keep age_cat birth_year_cat ahar* birth_year_correct 

	assert ahar_sim==ahar_sim3 if age_cat==15
	sum birth_year_correct 

	gen sim = `i'

	* Accumulate: after the last run the data in memory hold every run
	append using $pre.sim.dta 

	save $pre.sim.dta, replace 

	display "***`i'***"
}

* Minimum, maximum and mean across simulation runs, by birth-year category.
* Uses the data left in memory by the loop (all runs appended).
local run_stats ahar_sim birth_year_correct
table birth_year_cat (var), ///
	statistic(min `run_stats') ///
	statistic(max `run_stats') ///
	statistic(mean `run_stats')

* Same three statistics for the birth-year difference, by single birth year
use $pre.sim_byc.dta, clear 

local byc_stat birth_year_dif
table birth_year (var), ///
	statistic(min `byc_stat') ///
	statistic(max `byc_stat') ///
	statistic(mean `byc_stat')
	
* Reduce the stacked simulation runs to one row per age and birth-year
* category: minimum, maximum and mean of the simulated rate, plus the mean
* of every ahar_* variable.
use $pre.sim.dta, clear 

collapse ///
	(min) ahar_sim_min=ahar_sim ///
	(max) ahar_sim_max=ahar_sim ///
	(mean) ahar_sim_avg=ahar_sim ahar_*, ///
	by(age_cat birth_year_cat)

* For each comparison series and each simulated summary, compute the
* absolute relative difference, its mean within age category, and the
* within-age-category correlation between the two series.
local series pc pc_lead rc rca1 rca2 pc_my pc_both
local summaries avg min max

foreach s of local series {
	foreach k of local summaries {
		local simvar ahar_sim_`k'
		local obsvar ahar_`s'
		local tag sim_`k'_`s'

		gen dif_`tag' = abs((`simvar' - `obsvar')/`simvar')
		by age_cat, sort: egen mean_dif_`tag' = mean(dif_`tag')
		by age_cat, sort: egen corr_dif_`tag' = corr(`simvar' `obsvar')
	}
}

* Variables reported in Table 3, in display order; the same list drives
* both the ordering of the full dataset and the Table 3 export.
local table3 age_cat birth_year_cat ahar_sim_avg ahar_pc *if_sim_avg_pc ///
	ahar_pc_lead *if_sim_avg_pc_lead ahar_rc *if_sim_avg_rc

order `table3', first

***Table 3
preserve 
keep `table3'
export excel using "$od/Table3.xlsx", firstrow(variables) replace
restore 

***Appendix Tables A2, A3, and A4 with further details 
* Written to a single workbook; variables ending in _log are excluded
preserve 
keep age_cat birth_year_cat ahar_sim* ahar_pc *if_sim_*_pc ///
	ahar_pc_lead *if_sim_*_pc_lead ahar_rc *if_sim_*_rc ///
	ahar_rca* *if_sim_*rca* ahar_pc_my *if_sim*pc_my 
drop *_log 
order age_cat birth_year_cat ahar*, first 
export excel using "$od/AppTable2.xlsx", firstrow(variables) replace
restore 

* Close the log if one is open
capture log close
